This package aims to perform metabolic pathway-based subytping of patient tumor samples from gene expression data.
The main steps are:
1. Perform GSVA on cancer patient gene expression data using KEGG metabolic pathways.
2. Perform k-means clustering on the GSVA matrix, identify metabolic subtypes. Optimal number of k is defined based on data or can be user-specified.
3. Summarize pathway activity per cluster: i.e. mean pathway activity per cluster, or do differential testing.
4. Perform Kaplan-Meier (KM) analysis with clusters if survival data are present.
…
# Set wd to current location:
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
########## Load libraries
library("tidyverse")
library("fgsea")
# devtools::install_github('dBenedek/MetabolicExpressR')
library("MetabolicExpressR")
########## Load data
# KEGG metabolic pathways gene set:
kegg_gs <- gmtPathways("../data/kegg_metabolic_human_20211026.gmt")
# CPTAC-HNSCC
gene_exp_cptac_hnscc <- read.table("../data/CPTAC_HNSCC/HS_CPTAC_HNSCC_RNAseq_RSEM_UQ_log2_Normal.cct",
sep = "\t", stringsAsFactors = F, header = T) %>%
column_to_rownames("Idx")
clinical_cptac_hnscc <- read.table("../data/CPTAC_HNSCC/HS_CPTAC_HNSCC_CLI.tsi",
sep = "\t", header = T, stringsAsFactors = F)
clinical_cptac_hnscc <- clinical_cptac_hnscc[-1, ] %>%
mutate(case_id = str_replace_all(case_id, "-", "\\."), overall_survival = as.numeric(overall_survival),
overall_free_status = as.numeric(overall_free_status), progression_free_status = as.numeric(progression_free_status),
progression_free_survival = as.numeric(progression_free_survival)) %>%
filter(P16 != "Positive (>70% nuclear and cytoplasmic staining)" & case_id %in%
colnames(gene_exp_cptac_hnscc))
# TCGA-ACC
gene_exp_tcga_acc <- read.table("../data/TCGA_ACC/ACC.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.data.txt",
sep = "\t", stringsAsFactors = F, header = T)
gene_exp_tcga_acc <- gene_exp_tcga_acc[-1, ] %>%
mutate(Hybridization.REF = str_replace(Hybridization.REF, "\\|\\d+$", "")) %>%
filter(Hybridization.REF != "?" & Hybridization.REF != "SLC35E2") %>%
column_to_rownames("Hybridization.REF") %>%
mutate_if(is.character, as.numeric)
colnames(gene_exp_tcga_acc) <- colnames(gene_exp_tcga_acc) %>%
str_extract("TCGA\\.[\\w\\d]{2}\\.[\\w\\d]{4}")
clinical_cptac_acc <- read.table("../data/TCGA_ACC/HS_CPTAC_ACC_CLI.tsi", sep = "\t",
header = T, stringsAsFactors = F) %>%
column_to_rownames("attrib_name") %>%
t() %>%
as.data.frame() %>%
rownames_to_column("case_id")
clinical_cptac_acc <- clinical_cptac_acc %>%
mutate(overall_survival = as.numeric(overall_survival), overall_free_status = as.numeric(status))
# TCGA-UVM
gene_exp_tcga_uvm <- read.table("../data/TCGA_UVM/UVM.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.data.txt",
sep = "\t", stringsAsFactors = F, header = T)
gene_exp_tcga_uvm <- gene_exp_tcga_uvm[-1, ] %>%
mutate(Hybridization.REF = str_replace(Hybridization.REF, "\\|\\d+$", "")) %>%
filter(Hybridization.REF != "?" & Hybridization.REF != "SLC35E2") %>%
column_to_rownames("Hybridization.REF") %>%
mutate_if(is.character, as.numeric)
colnames(gene_exp_tcga_uvm) <- colnames(gene_exp_tcga_uvm) %>%
str_extract("TCGA\\.[\\w\\d]{2}\\.[\\w\\d]{4}")
clinical_cptac_uvm <- read.table("../data/TCGA_UVM/HS_CPTAC_UVM_CLI.tsi", sep = "\t",
header = T, stringsAsFactors = F) %>%
column_to_rownames("attrib_name") %>%
t() %>%
as.data.frame() %>%
rownames_to_column("case_id")
clinical_cptac_uvm <- clinical_cptac_uvm %>%
mutate(overall_survival = as.numeric(overall_survival), overall_free_status = as.numeric(status))
# TCGA-LAML
gene_exp_tcga_laml <- read.table("../data/TCGA_LAML/LAML.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.data.txt",
sep = "\t", stringsAsFactors = F, header = T)
gene_exp_tcga_laml <- gene_exp_tcga_laml[-1, ] %>%
mutate(Hybridization.REF = str_replace(Hybridization.REF, "\\|\\d+$", "")) %>%
filter(Hybridization.REF != "?" & Hybridization.REF != "SLC35E2") %>%
column_to_rownames("Hybridization.REF") %>%
mutate_if(is.character, as.numeric)
colnames(gene_exp_tcga_laml) <- colnames(gene_exp_tcga_laml) %>%
str_extract("TCGA\\.[\\w\\d]{2}\\.[\\w\\d]{4}")
clinical_cptac_laml <- read.table("../data/TCGA_LAML/HS_CPTAC_LAML_CLI.tsi", sep = "\t",
header = T, stringsAsFactors = F) %>%
column_to_rownames("attrib_name") %>%
t() %>%
as.data.frame() %>%
rownames_to_column("case_id")
clinical_cptac_laml <- clinical_cptac_laml %>%
mutate(overall_survival = as.numeric(overall_survival), overall_free_status = as.numeric(status))
# CPTAC-COAD data:
gene_exp_cptac_coad <- read.table("../data/CPTAC_COAD/Human__CPTAC_COAD__UNC__RNAseq__HiSeq_RNA__03_01_2017__BCM__Gene__BCM_RSEM_UpperQuartile_log2.cct",
sep = "\t", stringsAsFactors = F, header = T) %>%
column_to_rownames("attrib_name")
# CPTAC-COAD: There is no survival days in survival data
# TCGA-COAD data:
gene_exp_tcga_coad <- read.table(gzfile("../data/TCGA_COAD/Human__TCGA_COADREAD__UNC__RNAseq__HiSeq_RNA__01_28_2016__BI__Gene__Firehose_RSEM_log2.cct.gz"),
sep = "\t", stringsAsFactors = F, header = T) %>%
column_to_rownames("attrib_name")
clinical_tcga_coad <- read.table("../data/TCGA_COAD/HS_CPTAC_TCGA_COAD_CLI.tsi",
sep = "\t", header = T, stringsAsFactors = F) %>%
column_to_rownames("attrib_name") %>%
t() %>%
as.data.frame() %>%
rownames_to_column("case_id")
clinical_tcga_coad <- clinical_tcga_coad %>%
mutate(overall_survival = as.numeric(overall_survival), overall_free_status = as.numeric(status))
GSVA: gene set variation analysis for microarray and RNA-Seq data
This might take some time, especially for larger (n > 100) data sets.
# CPTAC-HNSC data:
gsva_metab_cptac_hnscc <- run_gsva_metabolic(gene_exp_cptac_hnscc, kcdf = "Gaussian",
kegg_gs = kegg_gs)
# TCGA-ACC data:
gsva_metab_tcga_acc <- run_gsva_metabolic(gene_exp_tcga_acc, kcdf = "Gaussian", kegg_gs = kegg_gs)
# TCGA-UVM data:
gsva_metab_tcga_uvm <- run_gsva_metabolic(gene_exp_tcga_uvm, kcdf = "Gaussian", kegg_gs = kegg_gs)
# TCGA-LAML data:
gsva_metab_tcga_laml <- run_gsva_metabolic(gene_exp_tcga_laml, kcdf = "Gaussian",
kegg_gs = kegg_gs)
# TCGA-COAD data:
gsva_metab_tcga_coad <- run_gsva_metabolic(gene_exp_tcga_coad, kcdf = "Gaussian",
kegg_gs = kegg_gs)
# CPTAC-COAD data:
gsva_metab_cptac_coad <- run_gsva_metabolic(gene_exp_cptac_coad, kcdf = "Gaussian",
kegg_gs = kegg_gs)
It is recommended to use the optimal number of k for the clustering step. This is selected based on several indices included in the NbClust R library. Otherwise, you can set the parameters user_def_k = TRUE and k = ... in the kmeans_gsva_metabolic() function.
# CPTAC-HNSC data:
kmeans_clusters_cptac_hnscc <- kmeans_gsva_metabolic(gsva_metab_cptac_hnscc, kegg_gs = kegg_gs)
# TCGA-ACC data:
kmeans_clusters_tcga_acc <- kmeans_gsva_metabolic(gsva_metab_tcga_acc, kegg_gs = kegg_gs)
# TCGA-UVM data:
kmeans_clusters_tcga_uvm <- kmeans_gsva_metabolic(gsva_metab_tcga_uvm, kegg_gs = kegg_gs)
# TCGA-LAML data:
kmeans_clusters_tcga_laml <- kmeans_gsva_metabolic(gsva_metab_tcga_laml, kegg_gs = kegg_gs)
# TCGA-COAD data:
kmeans_clusters_tcga_coad <- kmeans_gsva_metabolic(gsva_metab_tcga_coad, kegg_gs = kegg_gs,
user_def_k = TRUE, k = 2)
# CPTAC-COAD data:
kmeans_clusters_cptac_coad <- kmeans_gsva_metabolic(gsva_metab_cptac_coad, kegg_gs = kegg_gs)
kmeans_clusters_cptac_hnscc$heatmap
kmeans_clusters_tcga_acc$heatmap
kmeans_clusters_tcga_uvm$heatmap
kmeans_clusters_tcga_laml$heatmap
kmeans_clusters_tcga_coad$heatmap
kmeans_clusters_cptac_coad$heatmap
plot_mean_pathway_activity(gsva_data = gsva_metab_cptac_hnscc, kegg_gs = kegg_gs,
kmeans_res = kmeans_clusters_cptac_hnscc$kmean_res)
plot_mean_pathway_activity(gsva_data = gsva_metab_tcga_acc, kegg_gs = kegg_gs, kmeans_res = kmeans_clusters_tcga_acc$kmean_res)
plot_mean_pathway_activity(gsva_data = gsva_metab_tcga_uvm, kegg_gs = kegg_gs, kmeans_res = kmeans_clusters_tcga_uvm$kmean_res)
plot_mean_pathway_activity(gsva_data = gsva_metab_tcga_laml, kegg_gs = kegg_gs, kmeans_res = kmeans_clusters_tcga_laml$kmean_res)
plot_mean_pathway_activity(gsva_data = gsva_metab_tcga_coad, kegg_gs = kegg_gs, kmeans_res = kmeans_clusters_tcga_coad$kmean_res)
plot_mean_pathway_activity(gsva_data = gsva_metab_cptac_coad, kegg_gs = kegg_gs,
kmeans_res = kmeans_clusters_cptac_coad$kmean_res)
From the website of PROGENy: “PROGENy is resource that leverages a large compendium of publicly available signaling perturbation experiments to yield a common core of pathway responsive genes for human and mouse. These, coupled with any statistical method, can be used to infer pathway activities from bulk or single-cell transcriptomics.”
# CPTAC-HNSC data:
progeny_cptac_hnsc <- run_progeny(gene_exp_data = gene_exp_cptac_hnscc, kmeans_res = kmeans_clusters_cptac_hnscc$kmean_res)
progeny_cptac_hnsc$progeny_boxplot
# TCGA-ACC data:
progeny_tcga_acc <- run_progeny(gene_exp_data = gene_exp_tcga_acc, kmeans_res = kmeans_clusters_tcga_acc$kmean_res)
progeny_tcga_acc$progeny_boxplot
# TCGA-UVM data:
progeny_tcga_uvm <- run_progeny(gene_exp_data = gene_exp_tcga_uvm, kmeans_res = kmeans_clusters_tcga_uvm$kmean_res)
progeny_tcga_uvm$progeny_boxplot
# TCGA-LAML data:
progeny_tcga_lam <- run_progeny(gene_exp_data = gene_exp_tcga_laml, kmeans_res = kmeans_clusters_tcga_laml$kmean_res)
progeny_tcga_lam$progeny_boxplot
# TCGA-COAD data:
progeny_tcga_coad <- run_progeny(gene_exp_data = gene_exp_tcga_coad, kmeans_res = kmeans_clusters_tcga_coad$kmean_res)
progeny_tcga_coad$progeny_boxplot
# CPTAC-COAD data:
progeny_cptac_coad <- run_progeny(gene_exp_data = gene_exp_cptac_coad, kmeans_res = kmeans_clusters_cptac_coad$kmean_res)
progeny_cptac_coad$progeny_boxplot
# CPTAC-HNSC data:
kmeans_metab_clust_surv(kmeans_res = kmeans_clusters_cptac_hnscc$kmean_res, clinical_data = clinical_cptac_hnscc,
sample_id_col = "case_id", surv_status_col = "overall_free_status", surv_time_col = "overall_survival")
# TCGA-ACC data:
kmeans_metab_clust_surv(kmeans_res = kmeans_clusters_tcga_acc$kmean_res, clinical_data = clinical_cptac_acc,
sample_id_col = "case_id", surv_status_col = "overall_free_status", surv_time_col = "overall_survival")
# TCGA-UVM data:
kmeans_metab_clust_surv(kmeans_res = kmeans_clusters_tcga_uvm$kmean_res, clinical_data = clinical_cptac_uvm,
sample_id_col = "case_id", surv_status_col = "overall_free_status", surv_time_col = "overall_survival")
# TCGA-LAML data:
kmeans_metab_clust_surv(kmeans_res = kmeans_clusters_tcga_laml$kmean_res, clinical_data = clinical_cptac_laml,
sample_id_col = "case_id", surv_status_col = "overall_free_status", surv_time_col = "overall_survival")
# TCGA-COAD data:
kmeans_metab_clust_surv(kmeans_res = kmeans_clusters_tcga_coad$kmean_res, clinical_data = clinical_tcga_coad,
sample_id_col = "case_id", surv_status_col = "overall_free_status", surv_time_col = "overall_survival")
sessionInfo()
## R version 4.3.3 (2024-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.4 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Europe/Berlin
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] MetabolicExpressR_0.0.1.0 fgsea_1.25.2
## [3] lubridate_1.9.2 forcats_1.0.0
## [5] stringr_1.5.0 dplyr_1.1.2
## [7] purrr_1.0.1 readr_2.1.4
## [9] tidyr_1.3.0 tibble_3.2.1
## [11] ggplot2_3.4.2 tidyverse_2.0.0
## [13] BiocStyle_2.28.1
##
## loaded via a namespace (and not attached):
## [1] RColorBrewer_1.1-3 rstudioapi_0.14
## [3] jsonlite_1.8.4 shape_1.4.6
## [5] magrittr_2.0.3 magick_2.8.0
## [7] farver_2.1.1 rmarkdown_2.21
## [9] GlobalOptions_0.1.2 zlibbioc_1.45.0
## [11] vctrs_0.6.2 Cairo_1.6-1
## [13] memoise_2.0.1 DelayedMatrixStats_1.22.0
## [15] RCurl_1.98-1.12 rstatix_0.7.2
## [17] htmltools_0.5.5 S4Arrays_1.0.4
## [19] broom_1.0.4 Rhdf5lib_1.22.0
## [21] rhdf5_2.44.0 sass_0.4.5
## [23] bslib_0.4.2 plyr_1.8.8
## [25] zoo_1.8-12 cachem_1.0.7
## [27] commonmark_1.9.0 lifecycle_1.0.3
## [29] iterators_1.0.14 pkgconfig_2.0.3
## [31] rsvd_1.0.5 Matrix_1.6-3
## [33] R6_2.5.1 fastmap_1.1.1
## [35] GenomeInfoDbData_1.2.10 MatrixGenerics_1.11.1
## [37] clue_0.3-64 digest_0.6.31
## [39] colorspace_2.1-0 AnnotationDbi_1.61.2
## [41] S4Vectors_0.37.7 DESeq2_1.39.8
## [43] irlba_2.3.5.1 GenomicRanges_1.51.4
## [45] RSQLite_2.3.1 ggpubr_0.6.0
## [47] beachmat_2.16.0 labeling_0.4.2
## [49] km.ci_0.5-6 fansi_1.0.4
## [51] timechange_0.2.0 abind_1.4-5
## [53] httr_1.4.5 compiler_4.3.3
## [55] bit64_4.0.5 withr_2.5.0
## [57] doParallel_1.0.17 backports_1.4.1
## [59] BiocParallel_1.33.12 viridis_0.6.3
## [61] carData_3.0-5 DBI_1.1.3
## [63] highr_0.10 HDF5Array_1.28.1
## [65] ggsignif_0.6.4 DelayedArray_0.26.3
## [67] rjson_0.2.21 tools_4.3.3
## [69] progeny_1.22.0 glue_1.6.2
## [71] rhdf5filters_1.12.1 gridtext_0.1.5
## [73] grid_4.3.3 cluster_2.1.6
## [75] reshape2_1.4.4 generics_0.1.3
## [77] gtable_0.3.3 KMsurv_0.1-5
## [79] tzdb_0.3.0 survminer_0.4.9
## [81] data.table_1.14.8 hms_1.1.3
## [83] xml2_1.3.3 car_3.1-2
## [85] BiocSingular_1.16.0 ScaledMatrix_1.8.1
## [87] utf8_1.2.3 XVector_0.39.0
## [89] BiocGenerics_0.45.3 markdown_1.6
## [91] ggrepel_0.9.3 foreach_1.5.2
## [93] pillar_1.9.0 GSVA_1.48.0
## [95] splines_4.3.3 circlize_0.4.15
## [97] ggtext_0.1.2 lattice_0.22-5
## [99] survival_3.5-7 bit_4.0.5
## [101] annotate_1.77.0 tidyselect_1.2.0
## [103] ComplexHeatmap_2.16.0 SingleCellExperiment_1.22.0
## [105] locfit_1.5-9.7 Biostrings_2.67.2
## [107] knitr_1.42 gridExtra_2.3
## [109] bookdown_0.34 IRanges_2.33.1
## [111] SummarizedExperiment_1.29.1 stats4_4.3.3
## [113] xfun_0.39 Biobase_2.59.0
## [115] matrixStats_0.63.0 stringi_1.7.12
## [117] yaml_2.3.7 evaluate_0.20
## [119] codetools_0.2-19 NbClust_3.0.1
## [121] BiocManager_1.30.20 graph_1.78.0
## [123] cli_3.6.1 xtable_1.8-4
## [125] munsell_0.5.0 jquerylib_0.1.4
## [127] survMisc_0.5.6 Rcpp_1.0.10
## [129] GenomeInfoDb_1.35.17 png_0.1-8
## [131] XML_3.99-0.14 parallel_4.3.3
## [133] blob_1.2.4 sparseMatrixStats_1.12.0
## [135] bitops_1.0-7 viridisLite_0.4.1
## [137] GSEABase_1.62.0 scales_1.2.1
## [139] crayon_1.5.2 GetoptLong_1.0.5
## [141] rlang_1.1.0 cowplot_1.1.1
## [143] fastmatch_1.1-3 KEGGREST_1.39.0
## [145] formatR_1.14